# Clear global environment
rm(list=ls())
# Load packages
library(tidyverse) # for data organization and manipulation
library(janitor) # for data cleaning
library(openxlsx) # for reading in Excel files
library(RUVSeq) # for exploring data normalization
library(factoextra) # for PCA
library(pheatmap) # for heatmaps
library(ggpubr) # for making qqplots
library(rstatix) # for statistical tests
library(ggvenn) # for Venn Diagrams
library(snakecase) # for renaming columns
library(patchwork) # for plotting
rename <- dplyr::rename
select <- dplyr::select
# Set graphing theme
theme_set(theme_bw())
# Master data frame
all_data <- read.xlsx("1_InputData/Raw_Proteomics_Data_Cleaned.xlsx") %>%
clean_names() %>%
rename("protein" = "gene") %>%
# Replace spaces and special symbols with periods so that accessions can be used in row names
mutate(accession = gsub("[^[:alnum:]]", ".", accession))
# How many total proteins were identified?
nrow(all_data)
## [1] 1023
# How many unique proteins/genes?
length(unique(all_data$protein))
## [1] 934
First, we will filter out proteins that had only 1 or 2 unique peptides, keeping those with 3 or more unique peptides.
data_filtered <- all_data %>%
# Filter by proteins that were identified by at least 3 peptides
filter(number_unique_peptides > 2) %>%
# Remove proteins called "[number] SV," which are actually peptides
# where a gene symbol or location cannot be assigned.
filter(!protein %in% c("1 SV", "2 SV", "3 SV", "4 SV"))
# How many total proteins/accessions?
nrow(data_filtered)
## [1] 669
# How many unique proteins/genes?
length(unique(data_filtered$protein))
## [1] 660
Next, we will filter for proteins that were present in at least 2 out of 3 samples for ONE of the isolation methods.
# Count how many NAs there are per protein and isolation method
na_count_bymethod <- data_filtered %>%
# Create transposed data frame
select(c(accession, h1_a35:h5_uc)) %>%
column_to_rownames("accession") %>%
t() %>% data.frame() %>%
# Create grouping variable and pivot data longer so that it can be grouped by method
rownames_to_column("sample_id") %>%
separate(sample_id, into = c("horse", "method"), remove = FALSE) %>%
pivot_longer(-c(sample_id:method), names_to = "accession", values_to = "value") %>%
# Summarize number of NAs by method and accession
group_by(method, accession) %>%
summarise(sum_na = sum(is.na(value))) %>%
# Determine whether each accession will be kept in the dataset by assigning it a yes or no
# based on whether any of the groups have 0 or 1 NAs
pivot_wider(id_cols = method, names_from = "accession", values_from = "sum_na") %>%
column_to_rownames("method") %>%
t() %>% data.frame()
na_count_bymethod <- na_count_bymethod %>%
mutate(keep = ifelse(apply(na_count_bymethod == 0, 1, any), "Y",
ifelse(apply(na_count_bymethod == 1, 1, any), "Y", "N")))
# Vector of accessions to keep
keep_accession <- na_count_bymethod %>%
rownames_to_column("accession") %>%
filter(keep == "Y") %>%
pull(accession)
# Filter data
data_filtered <- data_filtered %>%
filter(accession %in% keep_accession)
# How many accessions?
nrow(data_filtered)
## [1] 564
# How many unique proteins?
length(unique(data_filtered$protein))
## [1] 556
# How many accessions have no missing data?
count_nomissing <- data_filtered %>%
select(c(accession, h1_a35:h5_uc)) %>%
na.omit()
nrow(count_nomissing)
## [1] 62
Making supplemental table that contains entire dataset with annotations
Here, we plot the distribution of log2 protein abundance for each sample (omitting all of the values that are NA by changing them to 0 and not including them in the axes limits). We can see that, for the non-missing proteins, log2 data follow a roughly normal distribution across all samples and that MC/UC samples had overall fewer proteins.
# Data
data_for_indiv_hist <- data_filtered %>%
mutate(across(h1_a35:h5_uc, log2)) %>%
select(c(accession, h1_a35:h5_uc)) %>%
column_to_rownames("accession") %>%
pivot_longer(all_of(everything()), names_to = "sample", values_to = "value") %>%
replace(is.na(.), 0)
indiv_hist_counts <- ggplot(data_for_indiv_hist, aes(x = value)) +
geom_histogram(color = "black",
fill = "gray60",
alpha = 0.7,
binwidth = 1) +
ggtitle("Protein Abundance Distribution by Sample") +
ylab("Number of Proteins") +
xlab("Log2(Protein Abundance)") +
theme(plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(margin = ggplot2::margin(t = 10), size = 13),
axis.title.y = element_text(margin = ggplot2::margin(r = 10), size = 13),
axis.text = element_text(size = 12)) +
scale_x_continuous(limits = c(15,35)) +
facet_wrap(~sample)
indiv_hist_counts
## Warning: Removed 3097 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 24 rows containing missing values (`geom_bar()`).
# Prepare dataset
data_for_relexprplot <- data_filtered %>%
select(c(accession, h1_a35:h5_uc)) %>%
column_to_rownames("accession") %>%
t() %>% data.frame() %>%
rownames_to_column("sample_id") %>%
separate(sample_id, into = c("horse", "method"), remove = FALSE) %>%
mutate(across(c(horse:method), toupper)) %>%
select(-horse)
# Store IDs as a separate vector
ID <- data_for_relexprplot$sample_id
# Store conditions as a separate vector
groups <- as.factor(data_for_relexprplot$method)
# Prepare dataset
data_for_RLE <- data_filtered %>%
select(c(accession, h1_a35:h5_uc)) %>%
column_to_rownames("accession")
# Create SeqExpressionSet
exprSet <- newSeqExpressionSet(as.matrix(data_for_RLE),phenoData =
data.frame(groups,row.names=colnames(data_for_RLE)))
# Show groups whose distributions vary from the overall
colors <- c("#3b528b", "#21918c", "#5ec962", "#fde725")
plotRLE(exprSet, outline=FALSE, col = colors[groups])
Our relative log expression plot looks typical and does not suggest any batch effects or need for adjustment.
How many total NA values are there by method for the accessions kept by our detection filter?
# Sum of NA counts across all kept proteins by method
na_count_bymethod_sum <- na_count_bymethod %>%
rownames_to_column("accession") %>%
filter(accession %in% keep_accession) %>%
select(c(a35:uc)) %>%
summarise_all(sum)
na_count_bymethod_sum
## a35 a70 mc uc
## 1 436 529 1024 1103
Histogram of NA counts by accession:
# Create input data
na_data_forhist <- na_count_bymethod %>%
rownames_to_column("accession") %>%
filter(accession %in% keep_accession) %>%
select(c(a35:uc)) %>%
mutate(sum = a35 + a70 + mc + uc)
na_protein_hist <- ggplot(na_data_forhist, aes(x = sum)) +
geom_histogram(color = "black",
fill = "gray60",
alpha = 0.7,
binwidth = 1) +
ggtitle("Number of NAs per Accession") +
ylab("Number of NAs") +
xlab("Number of Accessions") +
scale_x_continuous(breaks = seq(0, 12, by = 1), limits = c(0, 12), expand = c(0.025, 0.025)) +
scale_y_continuous(breaks = seq(0, 100, by = 25), limits = c(0, 100), expand = c(0, 0)) +
theme(plot.title = element_text(hjust = 0.5, size = 16),
axis.title.x = element_text(margin = ggplot2::margin(t = 10), size = 13),
axis.title.y = element_text(margin = ggplot2::margin(r = 10), size = 13),
axis.text = element_text(size = 12))
na_protein_hist
Prepare elements needed for making heat maps:1) A function for making unique names for duplicate proteins and a 2) key for going between protein name and accession).
First, the function for making unique names:
# Function for making unique names
## Input = vector (x) and separation character(s) (sep) such as "_"
## Output = vector with duplicated elements renamed with .1, .2, .3 appended (for example)
make.unique = function(x, sep){
ave(x, x, FUN = function(a){if(length(a) > 1){paste(a, 1:length(a), sep = sep)} else {a}})
}
# Key to compare accessions and protein names
accession_prot_key <- data_filtered %>%
select(c(accession, protein))
Second, import MISEV annotation protein list and make list of subsets of contaminants:
# Read in MISEV annotation data
misev_prots <- read.xlsx("1_InputData/MISEV2018_Proteins.xlsx") %>% clean_names()
# Vector of proteins to filter by
misev_prots_cat3 <- misev_prots %>%
filter(category == "3 - Major components of non-EV co-isolated structures") %>%
pull(protein_symbol)
misev_prots_cat4 <- misev_prots %>%
filter(category == "4 - Transmembrane, lipid-bound and soluble proteins associated to other intracellular compartments than PM/edosomes") %>%
pull(protein_symbol)
corecontam_prots <- all_data %>%
filter(contaminant == TRUE) %>%
pull(protein)
Extract data for proteins from each of these lists in a format that is prepared for making a heatmap:
# Graphing data
data_forgraphing <- data_filtered %>%
select(c(accession, h1_a35:h5_uc)) %>%
mutate(across(h1_a35:h5_uc, log2)) %>%
column_to_rownames("accession") %>%
t() %>% data.frame() %>%
rownames_to_column("sample_id") %>%
separate(sample_id, into = c("horse", "method"), remove = FALSE) %>%
mutate(across(c(horse:method), toupper))
head(data_forgraphing)[,1:6]
## sample_id horse method P62327 F6S769 F7DVQ8
## 1 h1_a35 H1 A35 22.28357 22.60766 22.67475
## 2 h4_a35 H4 A35 22.44088 19.70321 24.71741
## 3 h5_a35 H5 A35 23.41957 22.71951 21.64302
## 4 h1_a70 H1 A70 NA 21.44733 22.16455
## 5 h4_a70 H4 A70 23.57941 22.61581 NA
## 6 h5_a70 H5 A70 23.18451 22.83262 21.83619
# Function for additional data cleaning
## Input: data frame formatted similar to data_forgraphing, vector of proteins to keep, and data frame of protein name/accession key
## Output: data frame formatted for heatmap
heatmap_data_prep_counts <- function(input_df, filtering_vec, protein_key){
# Get accessions
accession_df <- protein_key %>%
filter(protein %in% filtering_vec)
# Filter data
data <- input_df %>%
dplyr::select(c(sample_id, horse, method, all_of(accession_df$accession))) %>%
pivot_longer(-c(sample_id, horse, method), names_to = "accession", values_to = "value") %>%
left_join(accession_df, by = "accession") %>%
unite(protein_accession, protein, accession, sep = ".") %>%
group_by(method, protein_accession) %>%
summarise(sum = sum(!is.na(value))) %>%
pivot_wider(id_cols = "method", names_from = "protein_accession", values_from = "sum") %>%
mutate(method = recode(method, "A35" = "S35", "A70" = "S70")) %>%
mutate(method = factor(method, levels = c("S35", "S70", "MC", "UC"))) %>%
column_to_rownames("method") %>%
t() %>% data.frame() %>%
rownames_to_column("protein_accession") %>%
separate(protein_accession, into = c("protein", NA), sep = "[.]") %>%
mutate(protein = make.unique(protein, sep = "_Acc")) %>%
column_to_rownames("protein")
return(data)
}
Apply function for each of the protein subsets:
# Apply function
misev_cat3_forheatmap <- heatmap_data_prep_counts(data_forgraphing, misev_prots_cat3, accession_prot_key)
misev_cat4_forheatmap <- heatmap_data_prep_counts(data_forgraphing, misev_prots_cat4, accession_prot_key)
corecontam_forheatmap <- heatmap_data_prep_counts(data_forgraphing, corecontam_prots, accession_prot_key)
# Add group name to each data frame
misev_cat3_anno <- misev_cat3_forheatmap %>%
mutate(Category = "MISEV 3: Non-EV Co-Isolated Structures") %>%
select(Category)
misev_cat4_anno <- misev_cat4_forheatmap %>%
mutate(Category = "MISEV 4: Other Intracellular Compartments") %>%
select(Category)
corecontam_anno <- corecontam_forheatmap %>%
mutate(Category = "Non-Horse Proteins") %>%
select(Category)
# Bind data frames together
contaminant_heatmap_anno <- rbind(misev_cat3_anno, misev_cat4_anno, corecontam_anno) %>%
mutate(Category = factor(Category, levels = c("MISEV 3: Non-EV Co-Isolated Structures",
"MISEV 4: Other Intracellular Compartments", "Non-Horse Proteins")))
# Bind annotation data frames together
contaminant_heatmap_data <- rbind(misev_cat3_forheatmap, misev_cat4_forheatmap, corecontam_forheatmap)
# Define colors to be used to annotate the groups
contaminant_heatmap_color <- list(Category = c("MISEV 3: Non-EV Co-Isolated Structures" = "#31688e", "MISEV 4: Other Intracellular Compartments" = "#35b779", "Non-Horse Proteins" = "#fde725"))
# Add an index value into the dataframe so we can add splits between groups in heatmap
contaminant_heatmap_anno$index <- 1:nrow(contaminant_heatmap_anno)
# Make lists of where breaks should be placed in the heatmap to separate clusters of samples
seprows <- contaminant_heatmap_anno %>% group_by(Category) %>% slice_max(n = 1, order_by = index)
seprows <- sort(seprows$index)
# Make heatmap
contam_heatmap_counts <- pheatmap(as.matrix(contaminant_heatmap_data),
scale = "none",
color = colorRampPalette(c("grey95", "#443983"))(4),
legend_breaks = c(0, 1, 2, 3),
cluster_cols = FALSE,
cluster_rows = FALSE,
angle_col = c("0"),
border_color = "black",
annotation_row = contaminant_heatmap_anno %>% select(Category),
annotation_colors = contaminant_heatmap_color,
annotation_names_row = FALSE,
gaps_row = seprows)
pdf(file = "3_OutputFigs/Heatmap_Contaminants_Counts.pdf",
width = 6.5, height = 6)
contam_heatmap_counts
invisible(dev.off())
contam_heatmap_counts
# Function for additional data cleaning
## Input: data frame formatted similar to data_forgraphing, vector of proteins to keep, and data frame of protein name/accession key
## Output: data frame formatted for heatmap
heatmap_data_prep_abun <- function(input_df, filtering_vec, protein_key){
# Get accessions
accession_df <- protein_key %>%
filter(protein %in% filtering_vec)
# Filter data
data <- input_df %>%
dplyr::select(c(sample_id, horse, method, all_of(accession_df$accession))) %>%
pivot_longer(-c(sample_id, horse, method), names_to = "accession", values_to = "value") %>%
left_join(accession_df, by = "accession") %>%
unite(protein_accession, protein, accession, sep = ".") %>%
group_by(method, protein_accession) %>%
summarise(mean = mean(value, na.rm = TRUE)) %>%
pivot_wider(id_cols = "method", names_from = "protein_accession", values_from = "mean") %>%
mutate(method = recode(method, "A35" = "S35", "A70" = "S70")) %>%
mutate(method = factor(method, levels = c("S35", "S70", "MC", "UC"))) %>%
column_to_rownames("method") %>%
t() %>% data.frame() %>%
rownames_to_column("protein_accession") %>%
separate(protein_accession, into = c("protein", NA), sep = "[.]") %>%
mutate(protein = make.unique(protein, sep = "_Acc")) %>%
column_to_rownames("protein")
return(data)
}
# Apply function
misev_cat3_forheatmap <- heatmap_data_prep_abun(data_forgraphing, misev_prots_cat3, accession_prot_key)
misev_cat4_forheatmap <- heatmap_data_prep_abun(data_forgraphing, misev_prots_cat4, accession_prot_key)
corecontam_forheatmap <- heatmap_data_prep_abun(data_forgraphing, corecontam_prots, accession_prot_key)
## Using annotation and color data frame from above
# Bind annotation data frames together
contaminant_heatmap_data <- rbind(misev_cat3_forheatmap, misev_cat4_forheatmap, corecontam_forheatmap)
# Make heatmap
contam_heatmap_abun <- pheatmap(as.matrix(contaminant_heatmap_data),
scale = "none",
color = colorRampPalette(c("mistyrose", "violetred4"))(100),
cluster_cols = FALSE,
cluster_rows = FALSE,
angle_col = c("0"),
border_color = "black",
annotation_row = contaminant_heatmap_anno %>% dplyr::select(Category),
annotation_colors = contaminant_heatmap_color,
annotation_names_row = FALSE,
gaps_row = seprows,
na_col = "white")
pdf(file = "3_OutputFigs/Heatmap_Contaminants_Abundance.pdf",
width = 6.5, height = 6)
contam_heatmap_abun
invisible(dev.off())
contam_heatmap_abun
Prepare vectors to filter proteins by:
# Vector of proteins to filter by - MISEV category 1
## Get names of EV proteins in category 1
misev_prots_cat1 <- misev_prots %>%
filter(category == "1 - Transmembrane or GPI-anchored proteins associated to plasma membrane and/or endosomes") %>%
pull(protein_symbol)
## Extract names with a star (to be searched against differently) in category 1
misev_prots_cat1_general <- misev_prots_cat1[str_detect(misev_prots_cat1, "[*]")]
misev_prots_cat1_general
## [1] "GNA*" "ITGA*" "ITGB*" "SDC*" "H2-A*" "CD3*"
## Get all protein names in category 1
misev_prots_cat1 <- accession_prot_key %>%
filter(protein %in% misev_prots_cat1 |
str_detect(protein, "^GNA") |
str_detect(protein, "^ITGA") |
str_detect(protein, "^ITGB") |
str_detect(protein, "^SDC") |
str_detect(protein, "^H2A") |
str_detect(protein, "^CD3")) %>%
pull(protein)
# Vector of proteins to filter by - MISEV category 2
## Get names of EV proteins in category 2
misev_prots_cat2 <- misev_prots %>%
filter(category == "2 - Cytosolic proteins recovered in EVs") %>%
pull(protein_symbol)
# Extract names with a star (to be searched against differently) in category 2
misev_prots_cat2_general <- misev_prots_cat2[str_detect(misev_prots_cat2, "[*]")]
misev_prots_cat2_general
## [1] "CHMP*" "CAV*" "EHD*" "ANXA*" "ACT*" "TUB*"
# Get accessions
misev_prots_cat2 <- accession_prot_key %>%
filter(protein %in% misev_prots_cat2 |
str_detect(protein, "^CHMP") |
str_detect(protein, "^CAV") |
str_detect(protein, "^EHD") |
str_detect(protein, "^ANXA") |
str_detect(protein, "^ACT") |
str_detect(protein, "^TUB")) %>%
pull(protein)
# Apply function
misev_cat1_forheatmap <- heatmap_data_prep_counts(data_forgraphing, misev_prots_cat1, accession_prot_key)
misev_cat2_forheatmap <- heatmap_data_prep_counts(data_forgraphing, misev_prots_cat2, accession_prot_key)
# Add group name to each data frame
misev_cat1_anno <- misev_cat1_forheatmap %>%
mutate(Category = "MISEV 1: Transmembrane/GPI Anchored") %>%
select(Category)
misev_cat2_anno <- misev_cat2_forheatmap %>%
mutate(Category = "MISEV 2: Cytosolic Proteins in EVs") %>%
select(Category)
# Bind data frames together
marker_heatmap_anno <- rbind(misev_cat1_anno, misev_cat2_anno)
# Bind annotation data frames together
marker_heatmap_data <- rbind(misev_cat1_forheatmap, misev_cat2_forheatmap)
# Define colors to be used to annotate the groups
marker_heatmap_color <- list(Category = c("MISEV 1: Transmembrane/GPI Anchored" = "#21918c", "MISEV 2: Cytosolic Proteins in EVs" = "#90d743"))
# Add an index value into the dataframe so we can add splits between groups in heatmap
marker_heatmap_anno$index <- 1:nrow(marker_heatmap_anno)
# Make lists of where breaks should be placed in the heatmap to separate clusters of samples
seprows <- marker_heatmap_anno %>% group_by(Category) %>% slice_max(n = 1, order_by = index)
seprows <- sort(seprows$index)
# Make heatmap
marker_heatmap_counts <- pheatmap(as.matrix(marker_heatmap_data),
scale = "none",
color = colorRampPalette(c("grey95", "#443983"))(4),
legend_breaks = c(0, 1, 2, 3),
cluster_cols = FALSE,
cluster_rows = FALSE,
angle_col = c("0"),
border_color = "black",
fontsize_row = 8,
annotation_row = marker_heatmap_anno %>% select(Category),
annotation_colors = marker_heatmap_color,
annotation_names_row = FALSE,
gaps_row = seprows)
pdf(file = "3_OutputFigs/Heatmap_Marker_Counts.pdf",
width = 6, height = 10)
marker_heatmap_counts
invisible(dev.off())
marker_heatmap_counts
What is the count of EV markers with presence in all three horses across each method?
marker_heatmap_data %>%
summarise(across(everything(), \(x) count (x == 3)))
## S35 S70 MC UC
## 1 39 29 6 5
# Apply function
misev_cat1_forheatmap <- heatmap_data_prep_abun(data_forgraphing, misev_prots_cat1, accession_prot_key)
misev_cat2_forheatmap <- heatmap_data_prep_abun(data_forgraphing, misev_prots_cat2, accession_prot_key)
## Using annotation and color data frame from above
# Bind annotation data frames together
marker_heatmap_data <- rbind(misev_cat1_forheatmap, misev_cat2_forheatmap)
# Make heatmap
marker_heatmap_abun <- pheatmap(as.matrix(marker_heatmap_data),
scale = "none",
color = colorRampPalette(c("mistyrose", "violetred4"))(100),
cluster_cols = FALSE,
cluster_rows = FALSE,
angle_col = c("0"),
border_color = "black",
fontsize_row = 8,
annotation_row = marker_heatmap_anno %>% select(Category),
annotation_colors = marker_heatmap_color,
annotation_names_row = FALSE,
gaps_row = seprows,
na_col = "white")
pdf(file = "3_OutputFigs/Heatmap_Marker_Abundance.pdf",
width = 6, height = 10)
marker_heatmap_abun
invisible(dev.off())
marker_heatmap_abun
# Get vector to filter by
celltype_prots <- accession_prot_key %>%
filter(str_detect(protein, "^CD[0-9]") |
str_detect(protein, "^LY6G") |
str_detect(protein, "^CCR") |
protein == "EPCAM") %>%
pull(protein)
celltype_prots
## [1] "LY6G6C" "CD82" "EPCAM" "CD48" "CD14" "CD5L" "CD55" "CD36"
## [9] "CD151" "CD44" "CD163" "CD109"
# Remove CD5L since it's not actually a cell surface molecule
celltype_prots <- celltype_prots[!celltype_prots == "CD5L"]
# Apply data cleaning function
celltype_forheatmap <- heatmap_data_prep_counts(data_forgraphing, celltype_prots, accession_prot_key)
# Make heatmap
celltype_heatmap_counts <- pheatmap(as.matrix(celltype_forheatmap),
scale = "none",
color = colorRampPalette(c("grey95", "#443983"))(4),
legend_breaks = c(0, 1, 2, 3),
cluster_cols = FALSE,
cluster_rows = FALSE,
angle_col = c("0"),
border_color = "black")
pdf(file = "3_OutputFigs/Heatmap_CellType_Counts.pdf",
width = 2.75, height = 3)
celltype_heatmap_counts
invisible(dev.off())
celltype_heatmap_counts
# Apply data cleaning function
celltype_forheatmap <- heatmap_data_prep_abun(data_forgraphing, celltype_prots, accession_prot_key)
# Make heatmap
celltype_heatmap_abun <- pheatmap(as.matrix(celltype_forheatmap),
scale = "none",
color = colorRampPalette(c("mistyrose", "violetred4"))(100),
cluster_cols = FALSE,
cluster_rows = FALSE,
angle_col = c("0"),
border_color = "black",
na_col = "white")
pdf(file = "3_OutputFigs/Heatmap_CellType_Abundance.pdf",
width = 2.75, height = 3)
celltype_heatmap_abun
invisible(dev.off())
celltype_heatmap_abun
Prepare data frame:
# Filter for proteins without any missing expression
data_filtered_nomissing <- data_filtered %>%
select(c(accession, protein, h1_a35:h5_uc)) %>%
na.omit()
Log2 transform data and assess normality.
# Prepare data
data_filtered_nomissing_log2 <- data_filtered_nomissing %>%
mutate(across(h1_a35:h5_uc, \(x) log2(x)))
data_fornormalitytest <- data_filtered_nomissing_log2 %>%
select(-protein) %>%
remove_rownames() %>%
column_to_rownames("accession") %>%
t() %>% data.frame()
# Write function for normality
normality_assessment <- function(data) {
## [1] SHAPIRO WILK TEST WITH ALL ENDPOINTS
# Test normality of each chemical
shapiro_wilk <- apply(data, 2, shapiro.test)
# Create data frame to summarize results
shapiro_wilk <- do.call(rbind.data.frame, shapiro_wilk)
shapiro_wilk <- format(shapiro_wilk, scientific = FALSE)
# Add column to adjust for multiple hypothesis testing
shapiro_wilk$p.value.adj <- p.adjust(shapiro_wilk$p.value, "BH")
# Add column for normality conclusion
shapiro_wilk <- shapiro_wilk %>% mutate(normal = ifelse(p.value.adj < 0.05, F, T))
## [2] SHAPIRO WILK TEST SUMMARY
# Make new data frame with summary values
shapiro_wilk_summ <- data.frame("count_normal" = nrow(shapiro_wilk %>% filter(p.value.adj >= 0.05)),
"count_nonnormal" = nrow(shapiro_wilk %>% filter(p.value.adj < 0.05))) %>%
mutate("perc_normal" = count_normal/(count_normal + count_nonnormal)*100)
## [3] PANEL OF HISTOGRAMS
histograms <- data %>%
pivot_longer(all_of(colnames(data)), names_to = "endpoint", values_to = "value") %>%
ggplot(aes(value)) +
geom_histogram(fill = "gray32", color = "black") +
facet_wrap(~ endpoint, scales = "free")
## [4] PANEL OF QQ PLOTS
qqplots <- ggqqplot(data %>%
pivot_longer(all_of(colnames(data)), names_to = "endpoint", values_to = "value"),
"value", facet.by = "endpoint", ggtheme = theme_bw(), scales = "free")
## STORE RESULTS
results <- list(shapiro_wilk, shapiro_wilk_summ, histograms, qqplots)
return(results)
}
# Apply normality test
normality_res <- normality_assessment(data_fornormalitytest)
# View results
normality_res[[2]]
## count_normal count_nonnormal perc_normal
## 1 62 0 100
normality_res[[3]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
normality_res[[4]]
Data are normally distributed, so we will proceed with ANOVA.
# Turn off scientific notation
options(scipen=999)
# Prepare data
data_forstats <- data_fornormalitytest %>%
rownames_to_column("sample_id") %>%
separate(sample_id, into = c("horse", "method")) %>%
mutate(across(horse:method, toupper)) %>%
mutate(method = recode(method, "A35" = "S35", "A70" = "S70")) %>%
mutate(method = factor(method, levels = c("S35", "S70", "MC", "UC"))) %>%
mutate(horse = recode(horse, "H4" = "H2", "H5" = "H3"))
# Create a list of the column names (accessions) to run the ANOVA on
endpoints <- colnames(data_forstats[, 3:ncol(data_forstats)])
# Create results data frame
anova_res_nomissing <- data.frame()
# Run ANOVA
for (i in 1:length(endpoints)) {
# Assign a name to the endpoint variable.
endpoint <- endpoints[i]
# Run an one-way ANOVA and store results in res.aov
res.aov <- anova_test(data = data_forstats,
dv = paste0(endpoint),
wid = horse,
within = method)
# Extract the results we are interested in.
res_df <- data.frame(get_anova_table(res.aov)) %>%
select(p) %>%
mutate(accession = paste0(endpoint)) %>%
relocate(accession, .before = "p")
# Bind to results data frame
anova_res_nomissing <- rbind(anova_res_nomissing, res_df)
}
# Adjust p-values
anova_res_nomissing$padj <- p.adjust(anova_res_nomissing$p, method = "BH")
# How many have significant padj?
nrow(anova_res_nomissing %>% filter(padj < 0.05))
## [1] 48
# Create cleaner results data frame to write out
anova_res_nomissing_cleaned <- anova_res_nomissing %>%
left_join(accession_prot_key, by = "accession") %>%
relocate(protein, .after = "accession") %>%
mutate(across(c(p, padj), \(x) formatC(x, format = "e", digits = 2))) %>%
arrange(padj)
# Write out
write.xlsx(anova_res_nomissing_cleaned, "2_OutputTables/ANOVA_Res_NoMissing.xlsx")
Next, follow up with posthoc pairwise testing.
# Create results data frame
pairwise_ttest_res_nomissing <- data.frame()
# Run ANOVA
for (i in 1:length(endpoints)) {
# Assign a name to the endpoint variable.
endpoint <- endpoints[i]
# Extract the results we are interested in.
res_df <- data_forstats %>%
pairwise_t_test(
as.formula(paste0(paste0(endpoint), "~", "method", sep = "")),
paired = TRUE,
p.adjust.method = "BH")
# Bind to results data frame
pairwise_ttest_res_nomissing <- rbind(pairwise_ttest_res_nomissing, res_df)
}
# How many unique accessions with significant pairwise values?
length(pairwise_ttest_res_nomissing %>%
filter(p.adj < 0.05) %>%
pull(".y.") %>%
unique())
## [1] 20
Pull just accessions with significant pairwise comparisons and graph for supplemental material.
# Clean data frame to begin prepping for graphing
pairwise_ttest_nomissing_forpanel <- pairwise_ttest_res_nomissing %>%
# Filter to only significant p-values
filter(p.adj <= 0.05) %>%
# Rename columns and join protein name
rename("accession" = ".y.", "Method" = "group2") %>%
# Make new symbol for labels to differentiate between comparisons
mutate(label = group1) %>%
mutate(label = recode(group1, "S35" = "a", "S70" = "b", "MC" = "c")) %>%
# Collapse labels
group_by(accession, Method) %>%
summarise(label = paste(label, collapse = ","))
# Get names of accessions
endpoints <- pairwise_ttest_nomissing_forpanel %>%
pull("accession") %>%
unique()
# Add make unique column to accession protein key
accession_prot_key_2 <- accession_prot_key %>%
mutate(protein = make.unique(protein, sep = "_Acc"))
# Create data frame indicating where y value should be for these annotations
sig_labs_y <- data_filtered_nomissing_log2 %>%
filter(accession %in% endpoints) %>%
column_to_rownames("accession") %>%
select(-protein) %>%
t() %>% as.data.frame() %>%
summarise(across(F7BUV8:F7CN11, \(x) max(x))) %>%
t() %>% as.data.frame() %>%
rownames_to_column("accession") %>%
rename("y_pos" = "V1") %>%
mutate(y_pos = y_pos*1.2) %>%
left_join(accession_prot_key_2, by = "accession")
# Join to labeling dataframe to complete what is needed to make the annotations
pairwise_ttest_nomissing_forpanel <- pairwise_ttest_nomissing_forpanel %>%
left_join(sig_labs_y, by = "accession")
# Pivot data longer and format for graphing
ttest_nomissing_data_forpanel <- data_filtered_nomissing_log2 %>%
filter(accession %in% endpoints) %>%
select(-protein) %>%
pivot_longer(!accession, names_to = "sample", values_to = "value") %>%
left_join(accession_prot_key_2, by = "accession") %>%
separate(sample, into = c("Horse", "Method")) %>%
mutate(across(c(Horse:Method), \(x) toupper(x))) %>%
mutate(Method = recode(Method, "A35" = "S35", "A70" = "S70")) %>%
mutate(Method = factor(Method, levels = c("S35", "S70", "MC", "UC"))) %>%
mutate(Horse = recode(Horse, "H4" = "H2", "H5" = "H3"))
# Make panel
ttest_panel <- ggplot(ttest_nomissing_data_forpanel) +
geom_boxplot(aes(x = Method, y = value, fill = Method), color = "black", outlier.shape = NA) +
scale_fill_viridis_d(begin = 0.25, end = 1, option = "D", name = "Method") +
geom_jitter(aes(x = Method, y = value, shape = Horse), color = "black",
position = position_jitter(0.15), size = 2) +
scale_shape_manual(values = c(15, 16, 17), name = "Horse") +
geom_text(data = pairwise_ttest_nomissing_forpanel, aes(x = Method, y = y_pos, label = label),
size = 4, hjust = 0.5, fontface = "bold") +
scale_y_continuous(expand = expansion(mult = c(0.1, 0.35))) +
ylab(expression(Log[2]*"(Relative Abundance)")) +
facet_wrap(~ protein, scales = "free_y", nrow = 5) +
theme(axis.title.x = element_blank(),
strip.background = element_rect(fill = "gray28"),
strip.text = element_text(color = "white", face = "bold", size = 11),
axis.text = element_text(color = "black"),
axis.text.x = element_text(size = 11),
axis.title.y = element_text(size = 11),
legend.position = "none")
ttest_panel
ggsave("3_OutputFigs/TTest_Panel_NoMissing.png",
width = 9,
height = 9,
units = c("in"))
We can also plot these proteins as a heatmap (for main body of manuscript):
# Filter data
data_anovasig_nomissing_forheatmap <- data_forstats %>%
unite(horse_method, horse, method, sep = "_", remove = FALSE)
sample_anno <- data_anovasig_nomissing_forheatmap %>%
select(c(horse_method:method)) %>%
dplyr::rename("Method" = "method", "Horse" = "horse") %>%
column_to_rownames("horse_method")
data_anovasig_nomissing_forheatmap <- data_anovasig_nomissing_forheatmap %>%
column_to_rownames("horse_method") %>%
select(-c(method, horse)) %>%
t() %>% data.frame() %>%
rownames_to_column("accession") %>%
left_join(accession_prot_key_2, by = "accession") %>%
column_to_rownames("protein") %>%
select(-accession)
# Define colors to be used to annotate the groups
heatmap_color <- list(Method = c("S35" = "#3b528b", "S70" = "#21918c",
"MC" = "#5ec962", "UC" = "#fde725"),
Horse = c("H1" = "grey75", "H2" = "grey55", "H3" = "grey35"))
# Make heatmap
heatmap_nomissing <- pheatmap(as.matrix(data_anovasig_nomissing_forheatmap),
scale = "none",
color = colorRampPalette(c("mistyrose", "violetred4"))(100),
angle_col = c("0"),
border_color = "black",
fontsize_row = 8,
show_colnames = FALSE,
annotation_col = sample_anno,
annotation_colors = heatmap_color,
annotation_names_col = FALSE)
pdf(file = "3_OutputFigs/Heatmap_AllProts_NoMissing.pdf",
width = 7, height = 9)
heatmap_nomissing
invisible(dev.off())
heatmap_nomissing
PCA plot:
# Prep data
data_for_pca_nomissing <- data_forstats %>%
unite(horse_method, horse, method, sep = "_") %>%
column_to_rownames("horse_method") %>%
t() %>% data.frame() %>%
rownames_to_column("accession") %>%
left_join(accession_prot_key_2, by = "accession") %>%
select(-accession) %>%
column_to_rownames("protein") %>% t()
# Run PCA
pca_res_counts <- prcomp(data_for_pca_nomissing)
# PCA plot
pca_plot_nomissing <- fviz_pca_ind(pca_res_counts,
label = "none",
alpha = 0,
mean.point = FALSE) +
geom_point(aes(shape = sample_anno$Horse, color = sample_anno$Method), size = 4) +
scale_shape_manual(values = c(15, 16, 17), name = "Horse") +
scale_color_viridis_d(begin = 0.25, end = 1, option = "D", name = "Method") +
labs(title = "Proteins with No Missing Data",
subtitle = "(n = 62 proteins)") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(color = "black"),
panel.border = element_rect(fill = NA, color = "black", linewidth = 0.3),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
plot.title = element_text(hjust = 0.5, size = 15, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 14),
legend.text = element_text(size = 12),
legend.title = element_text(size = 12))
# Write out plot
pdf(file = "3_OutputFigs/PCAPlot_Log2DataNoMissing.pdf",
width = 5, height = 3.75)
pca_plot_nomissing
invisible(dev.off())
pca_plot_nomissing
Plot contributions to dimension 1, since that is where we see most of the variability:
pca_contrib_plot_nomissing <- fviz_contrib(pca_res_counts, choice = "var", axes = 1, top = 10,
color = "black",
fill = "grey70") +
ggtitle("Contribution of Variables to Dim1") +
theme_set(theme_bw()) +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 15),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 13, margin = ggplot2::margin(r = 10)),
axis.text.x = element_text(angle = 45, hjust = 1, color = "black", size = 11),
axis.text.y = element_text(color = "black", size = 11),
plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "inches"))
# Write out plot
pdf(file = "3_OutputFigs/PCAContrib_Log2DataNoMissing.pdf",
width = 5, height = 3.75)
pca_contrib_plot_nomissing
invisible(dev.off())
pca_contrib_plot_nomissing
PCA contributions to scatter plot:
labels_pca <- c("SFTPD", "APOE", "SFTPB", "APOA1_Acc2", "IGHM_Acc1", "IGHM_Acc4", "GC")
pca_biplot <- fviz_pca_biplot(pca_res_counts, label ="var",
invisible = "ind",
col.var = "black",
select.var = list(name = labels_pca),
alpha.var = 0.3,
repel = TRUE) +
labs(title = "Proteins with No Missing Data",
subtitle = "(n = 62 proteins)") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(color = "black"),
panel.border = element_rect(fill = NA, color = "black", linewidth = 0.3),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
plot.title = element_text(hjust = 0.5, size = 15, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 14),
legend.text = element_text(size = 12),
legend.title = element_text(size = 12))
pdf(file = "3_OutputFigs/PCABiplot_Log2DataNoMissing.pdf",
width = 5, height = 4)
pca_biplot
invisible(dev.off())
Import functions:
# Load GSimp functions
options(stringsAsFactors = F)
source('~/Library/CloudStorage/OneDrive-UniversityofNorthCarolinaatChapelHill/Rager_Lab/Projects_Lead/4_HorseBALF_EVs/3_IsolationOptimizationExperiments/6_DataAnalysis/2_Proteomics/4_GSimpFunctions/Trunc_KNN/Imput_funcs.r', local = TRUE)
source('~/Library/CloudStorage/OneDrive-UniversityofNorthCarolinaatChapelHill/Rager_Lab/Projects_Lead/4_HorseBALF_EVs/3_IsolationOptimizationExperiments/6_DataAnalysis/2_Proteomics/4_GSimpFunctions/GSimp_evaluation.R', local = TRUE)
source('~/Library/CloudStorage/OneDrive-UniversityofNorthCarolinaatChapelHill/Rager_Lab/Projects_Lead/4_HorseBALF_EVs/3_IsolationOptimizationExperiments/6_DataAnalysis/2_Proteomics/4_GSimpFunctions/GSimp.R', local = TRUE)
Prepare input data. The input for this function is the detection-filtered data frame with the minimum values added as the last row in the data frame (proteins in columns and samples in rows). Note that the imputation will take a few minutes to run.
# Make a vector of the minimum values for each accession
min_detect <- data_filtered %>%
dplyr::select(c(accession, h1_a35:h5_uc)) %>%
pivot_longer(!accession, names_to = "sample", values_to = "value") %>%
group_by(accession) %>%
summarise(mdl = min(value, na.rm = TRUE))
# Add values to data frame
data_filtered_forGSimp <- data_filtered %>%
dplyr::select(c(accession, h1_a35:h5_uc)) %>%
left_join(min_detect, by = "accession") %>%
column_to_rownames("accession") %>%
t() %>% as.data.frame()
# Apply function
set.seed(8016)
data_filtered_imp <- pre_processing_GS_wrapper(data_filtered_forGSimp)
## Iteration 1 start...end!
## Iteration 2 start...end!
## Iteration 3 start...end!
## Iteration 4 start...end!
## Iteration 5 start...end!
## Iteration 6 start...end!
## Iteration 7 start...end!
## Iteration 8 start...end!
## Iteration 9 start...end!
## Iteration 10 start...end!
# Prepare Log2 Data
data_filtered_imp_log2_forpca <- log2(data_filtered_imp) %>%
rownames_to_column("horse_method") %>%
separate(horse_method, into = c("Horse", "Method")) %>%
mutate(across(c(Horse:Method), \(x) toupper(x))) %>%
mutate(Method = recode(Method, "A35" = "S35", "A70" = "S70")) %>%
mutate(Method = factor(Method, levels = c("S35", "S70", "MC", "UC"))) %>%
mutate(Horse = recode(Horse, "H4" = "H2", "H5" = "H3")) %>%
unite(horse_method, Horse, Method, remove = TRUE) %>%
column_to_rownames("horse_method") %>%
t() %>% data.frame() %>%
rownames_to_column("accession") %>%
left_join(accession_prot_key, by = "accession") %>%
mutate(protein = make.unique(protein, "_Acc")) %>%
select(-accession) %>%
column_to_rownames("protein") %>% t()
# Run PCA
pca_res_imp <- prcomp(as.matrix(data_filtered_imp_log2_forpca))
# PCA plot
pca_plot_imp <- fviz_pca_ind(pca_res_imp,
label = "none",
alpha = 0,
mean.point = FALSE) +
geom_point(aes(shape = sample_anno$Horse, color = sample_anno$Method), size = 4) +
scale_shape_manual(values = c(15, 16, 17), name = "Horse") +
scale_color_viridis_d(begin = 0.25, end = 1, option = "D", name = "Method") +
labs(title = "All Proteins Passing Detection Filter",
subtitle = "(n = 564 proteins)") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(color = "black"),
panel.border = element_rect(fill = NA, color = "black", linewidth = 0.3),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
plot.title = element_text(hjust = 0.5, size = 15, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 14),
legend.text = element_text(size = 12),
legend.title = element_text(size = 12))
# Write out plot
pdf(file = "3_OutputFigs/PCAPlot_ImpLog2Data.pdf",
width = 5, height = 3.75)
pca_plot_imp
invisible(dev.off())
pca_plot_imp
Plot contributions:
pca_contrib_plot_imp <- fviz_contrib(pca_res_imp, choice = "var", axes = 1, top = 10,
color = "black",
fill = "grey70") +
ggtitle("Contribution of Variables to Dim1") +
theme_set(theme_bw()) +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 15),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 13, margin = ggplot2::margin(r = 10)),
axis.text.x = element_text(angle = 45, hjust = 1, color = "black", size = 11),
axis.text.y = element_text(color = "black", size = 11),
plot.margin = unit(c(0.2, 0.2, 0.2, 0.2), "inches"))
# Write out plot
pdf(file = "3_OutputFigs/PCAContrib_ImpLog2Data.pdf",
width = 5, height = 3.75)
pca_contrib_plot_imp
invisible(dev.off())
pca_contrib_plot_imp
PCA contributions to scatter plot:
# Select labels
labels_pca_imp <- c("MUC1", "GPRC5C", "SDCBP", "SFTPD", "MFGE8", "GNB1", "RAC1")
pca_biplot_imp <- fviz_pca_biplot(pca_res_imp, label ="var",
invisible = "ind",
col.var = "black",
select.var = list(name = labels_pca_imp),
alpha.var = 0.3,
repel = TRUE) +
labs(title = "All Proteins Passing Detection Filter",
subtitle = "(n = 564 proteins)") +
theme(axis.title = element_text(size = 14),
axis.text = element_text(color = "black"),
panel.border = element_rect(fill = NA, color = "black", linewidth = 0.3),
panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
plot.title = element_text(hjust = 0.5, size = 15, face = "bold"),
plot.subtitle = element_text(hjust = 0.5, size = 14),
legend.text = element_text(size = 12),
legend.title = element_text(size = 12))
pdf(file = "3_OutputFigs/PCABiplot_ImpLog2Data.pdf",
width = 5, height = 4)
pca_biplot_imp
invisible(dev.off())
# Log2 and prep data
data_filtered_imp_log2 <- log2(data_filtered_imp) %>%
rownames_to_column("horse_method") %>%
separate(horse_method, into = c("Horse", "Method")) %>%
mutate(across(c(Horse:Method), \(x) toupper(x))) %>%
mutate(Method = recode(Method, "A35" = "S35", "A70" = "S70")) %>%
mutate(Method = factor(Method, levels = c("S35", "S70", "MC", "UC"))) %>%
mutate(Horse = recode(Horse, "H4" = "H2", "H5" = "H3")) %>%
unite(horse_method, Horse, Method, remove = TRUE) %>%
column_to_rownames("horse_method")
# Prep data
data_filtered_forheatmap <- data_filtered_imp_log2 %>% t()
# Make heatmap
allprots_heatmap <- pheatmap(data_filtered_forheatmap,
scale = "none",
color = colorRampPalette(c("mistyrose", "violetred4"))(100),
angle_col = c("0"),
border_color = "black",
show_colnames = FALSE,
show_rownames = FALSE,
annotation_col = sample_anno,
annotation_colors = heatmap_color,
annotation_names_col = FALSE)
allprots_heatmap
pdf(file = "3_OutputFigs/Heatmap_AllProts_Imp.pdf",
width = 7, height = 9)
allprots_heatmap
invisible(dev.off())
allprots_heatmap
Protein lists by method for IPA input and visualization (keeping proteins with detectable protein in 2 out of 3 horses). Note that for these, I kept the prefix as A (as in AFC) rather than S (SEC), so as to not need to repeat the IPA analysis, but they are equivalent.
# Make list of all contaminating proteins to remove
all_contams <- c(misev_prots_cat3, misev_prots_cat4, corecontam_prots)
# Filter and export data
a35_keep_forIPA <- na_count_bymethod %>%
rownames_to_column("accession") %>%
filter(accession %in% keep_accession) %>%
filter(a35 < 2) %>%
left_join(accession_prot_key, by = "accession") %>%
distinct(protein) %>%
filter(!protein %in% all_contams) %>%
select(protein)
write.xlsx(a35_keep_forIPA, "2_OutputTables/ProteinsForIPA_A35.xlsx")
a70_keep_forIPA <- na_count_bymethod %>%
rownames_to_column("accession") %>%
filter(accession %in% keep_accession) %>%
filter(a70 < 2) %>%
left_join(accession_prot_key, by = "accession") %>%
distinct(protein) %>%
filter(!protein %in% all_contams) %>%
select(protein)
write.xlsx(a70_keep_forIPA, "2_OutputTables/ProteinsForIPA_A70.xlsx")
mc_keep_forIPA <- na_count_bymethod %>%
rownames_to_column("accession") %>%
filter(accession %in% keep_accession) %>%
filter(mc < 2) %>%
left_join(accession_prot_key, by = "accession") %>%
distinct(protein) %>%
filter(!protein %in% all_contams) %>%
select(protein)
write.xlsx(mc_keep_forIPA, "2_OutputTables/ProteinsForIPA_MC.xlsx")
uc_keep_forIPA <- na_count_bymethod %>%
rownames_to_column("accession") %>%
filter(accession %in% keep_accession) %>%
filter(uc < 2) %>%
left_join(accession_prot_key, by = "accession") %>%
distinct(protein) %>%
filter(!protein %in% all_contams) %>%
select(protein)
write.xlsx(uc_keep_forIPA, "2_OutputTables/ProteinsForIPA_UC.xlsx")
Venn Diagram of Proteins:
# Prepare data
venn_protein_input <- list(
S35 = a35_keep_forIPA %>% pull(protein),
S70 = a70_keep_forIPA %>% pull(protein),
MC = mc_keep_forIPA %>% pull(protein),
UC = uc_keep_forIPA %>% pull(protein)
)
# Graph Data
protein_venn <- ggvenn(venn_protein_input,
fill_color = c("#3b528b", "#21918c","#5ec962", "#fde725"),
show_percentage = FALSE,
fill_alpha = 0.5,
text_size = 7)
pdf("3_OutputFigs/VennDiagram_Protein.pdf",
width = 5, height = 5)
protein_venn
invisible(dev.off())
protein_venn
Pathways analysis
# Import data
ipa_paths_a35 <- read.xlsx("1_InputData/IPA_A35.xlsx") %>% clean_names() %>% mutate(method = "S35")
ipa_paths_a70 <- read.xlsx("1_InputData/IPA_A70.xlsx") %>% clean_names() %>% mutate(method = "S70")
ipa_paths_mc <- read.xlsx("1_InputData/IPA_MC.xlsx") %>% clean_names() %>% mutate(method = "MC")
ipa_paths_uc <- read.xlsx("1_InputData/IPA_UC.xlsx") %>% clean_names() %>% mutate(method = "UC")
# Bind together
ipa_paths_all <- rbind(ipa_paths_a35, ipa_paths_a70, ipa_paths_mc, ipa_paths_uc)
# Distribution of p-values
ipa_paths_all %>%
ggplot(aes(log_b_h_p_value)) +
geom_histogram(fill = "gray32", color = "black") +
facet_wrap(~ method) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 307 rows containing non-finite values (`stat_bin()`).
# Distribution of ratios
ipa_paths_all %>%
ggplot(aes(ratio)) +
geom_histogram(fill = "gray32", color = "black") +
facet_wrap(~ method) +
scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# How many pathways have p < 0.05?
ipa_paths_all %>%
group_by(method) %>%
dplyr::summarize(n = sum(log_b_h_p_value < 1.3))
## # A tibble: 4 × 2
## method n
## <chr> <int>
## 1 MC 501
## 2 S35 461
## 3 S70 404
## 4 UC 379
# What is the overlap in pathways between methods?
venn_protein_input <- list(
S35 = ipa_paths_a35 %>% filter(log_b_h_p_value > 1.3) %>% pull(ingenuity_canonical_pathways),
S70 = ipa_paths_a70 %>% filter(log_b_h_p_value > 1.3) %>% pull(ingenuity_canonical_pathways),
MC = ipa_paths_mc %>% filter(log_b_h_p_value > 1.3) %>% pull(ingenuity_canonical_pathways),
UC = ipa_paths_uc %>% filter(log_b_h_p_value > 1.3) %>% pull(ingenuity_canonical_pathways)
)
# Graph Data
pathway_venn <- ggvenn(venn_protein_input,
fill_color = c("#3b528b", "#21918c","#5ec962", "#fde725"),
show_percentage = FALSE,
fill_alpha = 0.5,
text_size = 7)
pdf("3_OutputFigs/VennDiagram_Pathway.pdf",
width = 5, height = 5)
pathway_venn
invisible(dev.off())
pathway_venn
Plot top pathways, where bars are highlighted if they are shared between all of the methods in significance.
# Get pathways that overlap
overlap_pathways <- base::Reduce(intersect, venn_protein_input)
# Write function for graphing
# This function takes your input data frame (data), colors for the gradient scale (high color and low color), the number of pathways you want (n), and your comparison subtitle (subtitle) as input
# And outputs the graph of interest
graph_top_canon_2 <- function(data, n, title) {
data %>%
slice_head(n = n) %>%
mutate(shared_path = ifelse(ingenuity_canonical_pathways %in% overlap_pathways, "Yes", "No")) %>%
mutate(shared_path = factor(shared_path, levels = c("Yes", "No"))) %>%
mutate(n_molecules = count.fields(textConnection(molecules), sep = ",")) %>%
ggplot(aes(x =log_b_h_p_value, y = reorder(ingenuity_canonical_pathways, log_b_h_p_value), fill = shared_path)) +
scale_x_continuous(limits = c(0, 70), expand = c(0, 0)) +
scale_fill_manual(values = c("violetred4", "grey80")) +
geom_bar(colour="black", stat = "identity", width = 0.6) +
geom_text(aes(label = n_molecules), hjust = -0.3, size = 9) +
labs(title = paste0(title), x = "-log(BH p-value)", fill = "Shared\nPathway?") +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
axis.line = element_line(colour = "black"),
plot.title = element_text(hjust = 0.5, face = "bold", size = 34),
axis.title.y = element_blank(),
axis.text.y = element_text(colour = "black", size = 26),
axis.title.x = element_text(size = 26),
axis.text.x = element_text(colour = "black", size = 22),
legend.title = element_text(size = 22),
legend.text = element_text(size = 22),
legend.key.size = unit(8, 'mm'),
panel.border = element_blank(),
legend.position = c(0.8, 0.4),
plot.margin = unit(c(0.3, 0.3, 0.3, 0.3), "inches")
)
}
plot_canon_a35_2 <- graph_top_canon_2(ipa_paths_a35, 5, "S35")
plot_canon_a70_2 <- graph_top_canon_2(ipa_paths_a70, 5, "S70")
plot_canon_mc_2 <- graph_top_canon_2(ipa_paths_mc, 5, "MC")
plot_canon_uc_2 <- graph_top_canon_2(ipa_paths_uc, 5, "UC")
ipa_canon_panel_2 <- plot_canon_a35_2 / plot_canon_a70_2 / plot_canon_mc_2 / plot_canon_uc_2
pdf("3_OutputFigs/IPA_Canon_Panel.pdf",
width = 17, height = 17)
ipa_canon_panel_2
invisible(dev.off())
Alias search:
cd9_aliases <- c("TSPAN29", "MIC3", "MRP1", "P24", "BA2", "5H9", "DRAP27", "BTCC1")
cd63_aliases <- c("TSPAN30", "ME491", "PLTGP40", "HOP26", "LIMP1", "AD1", "LAMP3", "MLA1", "AD1")
cd81_aliases <- c("TSPAN28", "TAPA1", "S5.7", "S5", "S57", "CVID6")
data_filtered %>% filter(protein %in% cd9_aliases)
## [1] accession description
## [3] protein contaminant
## [5] modifications gocc_horse
## [7] gocc_contains_extracellular gocc_contains_vesicle
## [9] gocc_contains_exosome found_in_exocarta_database_human
## [11] found_in_vescarta_database_human coverage_percent
## [13] number_peptides number_unique_peptides
## [15] number_ps_ms number_a_as
## [17] mw_k_da h1_a35
## [19] h4_a35 h5_a35
## [21] h1_a70 h4_a70
## [23] h5_a70 h1_mc
## [25] h4_mc h5_mc
## [27] h1_uc h4_uc
## [29] h5_uc pool_r1
## [31] pool_r2 pool_r3
## <0 rows> (or 0-length row.names)
data_filtered %>% filter(protein %in% cd63_aliases)
## [1] accession description
## [3] protein contaminant
## [5] modifications gocc_horse
## [7] gocc_contains_extracellular gocc_contains_vesicle
## [9] gocc_contains_exosome found_in_exocarta_database_human
## [11] found_in_vescarta_database_human coverage_percent
## [13] number_peptides number_unique_peptides
## [15] number_ps_ms number_a_as
## [17] mw_k_da h1_a35
## [19] h4_a35 h5_a35
## [21] h1_a70 h4_a70
## [23] h5_a70 h1_mc
## [25] h4_mc h5_mc
## [27] h1_uc h4_uc
## [29] h5_uc pool_r1
## [31] pool_r2 pool_r3
## <0 rows> (or 0-length row.names)
data_filtered %>% filter(protein %in% cd81_aliases)
## [1] accession description
## [3] protein contaminant
## [5] modifications gocc_horse
## [7] gocc_contains_extracellular gocc_contains_vesicle
## [9] gocc_contains_exosome found_in_exocarta_database_human
## [11] found_in_vescarta_database_human coverage_percent
## [13] number_peptides number_unique_peptides
## [15] number_ps_ms number_a_as
## [17] mw_k_da h1_a35
## [19] h4_a35 h5_a35
## [21] h1_a70 h4_a70
## [23] h5_a70 h1_mc
## [25] h4_mc h5_mc
## [27] h1_uc h4_uc
## [29] h5_uc pool_r1
## [31] pool_r2 pool_r3
## <0 rows> (or 0-length row.names)
# First, clean up all data dataframe
all_data_forsupp <- all_data %>%
select(-c(contaminant, modifications, contaminant:gocc_contains_exosome, number_peptides,
number_ps_ms, number_a_as, pool_r1:pool_r3)) %>%
rename("Accession" = "accession", "Description" = "description", "Protein" = "protein",
"Human_Exocarta" = "found_in_exocarta_database_human", "Human_Vescarta" = "found_in_vescarta_database_human",
"Coverage_Percent" = "coverage_percent", "Number_Unique_Peptides" = "number_unique_peptides",
"Molecular_Weight_kDa" = "mw_k_da") %>%
mutate(Present_AllSamps = ifelse(Accession %in% count_nomissing$accession == TRUE, "Yes", "No")) %>%
mutate(Passed_DetFilter = ifelse(Accession %in% keep_accession == TRUE, "Yes", "No")) %>%
relocate(c(Present_AllSamps, Passed_DetFilter), .after = "Molecular_Weight_kDa") %>%
rename_with(.fn = ~ to_screaming_snake_case(., sep_out = ""), .cols = h1_a35:h5_uc) %>%
rename_with(.fn = ~ sub("^(.{2})", "\\1_", .), .cols = H1A35:H5UC) %>%
rename_with(.fn = ~gsub("A35", "S35", .), .cols = H1_A35:H5_UC) %>%
rename_with(.fn = ~gsub("A70", "S70", .), .cols = H1_S35:H5_UC) %>%
rename_with(.fn = ~gsub("H4", "H2", .), .cols = H1_S35:H5_UC) %>%
rename_with(.fn = ~gsub("H5", "H3", .), .cols = H1_S35:H5_UC)
# Add _imp to the imputed dataframe
data_imp_forsupp <- data_filtered_imp %>%
t() %>% data.frame() %>%
rename_with(.fn = ~ to_screaming_snake_case(., sep_out = ""), .cols = h1_a35:h5_uc) %>%
rename_with(.fn = ~ sub("^(.{2})", "\\1_", .), .cols = H1A35:H5UC) %>%
rename_with(.fn = ~gsub("A35", "S35", .), .cols = H1_A35:H5_UC) %>%
rename_with(.fn = ~gsub("A70", "S70", .), .cols = H1_S35:H5_UC) %>%
rename_with(.fn = ~gsub("H4", "H2", .), .cols = H1_S35:H5_UC) %>%
rename_with(.fn = ~gsub("H5", "H3", .), .cols = H1_S35:H5_UC) %>%
rename_with(~paste0(., "_imp")) %>%
mutate(across(everything(), \(x) round(x, digits = 0))) %>%
rownames_to_column("Accession")
# All data
write.xlsx(all_data_forsupp, "2_OutputTables/Protein_Data_ForSupp_All.xlsx")
# Filtered data
filtered_data_forsupp <- all_data_forsupp %>%
right_join(data_imp_forsupp, by = "Accession")
write.xlsx(filtered_data_forsupp, "2_OutputTables/Protein_Data_ForSupp_Filtered.xlsx")